{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Real-time inference with simultaneous essentia-tensorflow classifiers\n",
    "\n",
    "\n",
    "We are relying on soundcard to capture the audio loopback of the system.\n",
    "If not available it can be installed with `pip`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install pysoundcard"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib nbagg\n",
    "\n",
    "import numpy as np\n",
    "from scipy.special import softmax\n",
    "import matplotlib.pyplot as plt\n",
    "import soundcard as sc\n",
    "\n",
    "from struct import unpack\n",
    "from IPython import display\n",
    "\n",
    "from essentia.streaming import *\n",
    "from essentia import Pool, run, array, reset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For this demo, we will use four of our transfer learning classifiers: `danceability`, `voice_instrumental`, `mood_aggressive`, and `mood_happy`. These and more models can be downloaded from [Essentia models](https://essentia.upf.edu/models.html) site."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!wget https://essentia.upf.edu/models/classifiers/danceability/danceability-musicnn-msd-1.pb\n",
    "!wget https://essentia.upf.edu/models/classifiers/voice_instrumental/voice_instrumental-musicnn-msd-1.pb \n",
    "!wget https://essentia.upf.edu/models/classifiers/mood_aggressive/mood_aggressive-musicnn-msd-1.pb \n",
    "!wget https://essentia.upf.edu/models/classifiers/mood_happy/mood_happy-musicnn-msd-1.pb "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parameters optimized for real-time inference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# model parameters\n",
    "inputLayer = 'model/Placeholder'\n",
    "outputLayer = 'model/Sigmoid'\n",
    "\n",
    "labels = ['danceability', 'voice_instrumental', 'aggressiveness', 'happiness']\n",
    "nLabels = len(labels)\n",
    "\n",
    "sampleRate = 16000\n",
    "frameSize = 512 \n",
    "hopSize = 256\n",
    "numberBands = 96\n",
    "\n",
    "# analysis parameters\n",
    "patchSize = 64\n",
    "displaySize = 10\n",
    "\n",
    "bufferSize = patchSize * hopSize"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Instantiate and connect the algorithms "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "buffer = np.zeros(bufferSize, dtype='float32')\n",
    "\n",
    "vimp = VectorInput(buffer)\n",
    "\n",
    "fc = FrameCutter(frameSize=frameSize, hopSize=hopSize)\n",
    "\n",
    "tim = TensorflowInputMusiCNN()\n",
    "\n",
    "vtt = VectorRealToTensor(shape=[1, 1, patchSize, numberBands],\n",
    "                         lastPatchMode='discard')\n",
    "\n",
    "ttp = TensorToPool(namespace=inputLayer)\n",
    "\n",
    "tfp_danceability = TensorflowPredict(graphFilename='danceability-musicnn-msd-1.pb',\n",
    "                        inputs=[inputLayer],\n",
    "                        outputs=[outputLayer])\n",
    "tfp_voice_instrumental = TensorflowPredict(graphFilename='voice_instrumental-musicnn-msd-1.pb',\n",
    "                        inputs=[inputLayer],\n",
    "                        outputs=[outputLayer])\n",
    "tfp_aggressive = TensorflowPredict(graphFilename='mood_aggressive-musicnn-msd-1.pb',\n",
    "                        inputs=[inputLayer],\n",
    "                        outputs=[outputLayer])\n",
    "tfp_happy = TensorflowPredict(graphFilename='mood_happy-musicnn-msd-1.pb',\n",
    "                        inputs=[inputLayer],\n",
    "                        outputs=[outputLayer])\n",
    "\n",
    "ptt_danceability = PoolToTensor(namespace=outputLayer)\n",
    "ptt_voice_instrumental = PoolToTensor(namespace=outputLayer)\n",
    "ptt_aggressive = PoolToTensor(namespace=outputLayer)\n",
    "ptt_happy = PoolToTensor(namespace=outputLayer)\n",
    "\n",
    "ttv_danceability = TensorToVectorReal()\n",
    "ttv_voice_instrumental = TensorToVectorReal()\n",
    "ttv_aggressive = TensorToVectorReal()\n",
    "ttv_happy = TensorToVectorReal()\n",
    "\n",
    "pool = Pool()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "vimp.data   >> fc.signal\n",
    "fc.frame    >> tim.frame\n",
    "tim.bands   >> vtt.frame\n",
    "tim.bands   >> (pool, 'melbands')\n",
    "vtt.tensor  >> ttp.tensor\n",
    "\n",
    "ttp.pool  >> tfp_danceability.poolIn\n",
    "ttp.pool  >> tfp_voice_instrumental.poolIn\n",
    "ttp.pool  >> tfp_aggressive.poolIn\n",
    "ttp.pool  >> tfp_happy.poolIn\n",
    "\n",
    "tfp_danceability.poolOut        >> ptt_danceability.pool\n",
    "tfp_voice_instrumental.poolOut  >> ptt_voice_instrumental.pool\n",
    "tfp_aggressive.poolOut          >> ptt_aggressive.pool\n",
    "tfp_happy.poolOut               >> ptt_happy.pool\n",
    "\n",
    "ptt_danceability.tensor        >> ttv_danceability.tensor\n",
    "ptt_voice_instrumental.tensor  >> ttv_voice_instrumental.tensor\n",
    "ptt_aggressive.tensor          >> ttv_aggressive.tensor\n",
    "ptt_happy.tensor               >> ttv_happy.tensor\n",
    "\n",
    "ttv_danceability.frame         >> (pool, 'danceability')\n",
    "ttv_voice_instrumental.frame   >> (pool, 'voice_instrumental')\n",
    "ttv_aggressive.frame           >> (pool, 'aggressive')\n",
    "ttv_happy.frame                >> (pool, 'happy')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Callback to update the plots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def callback(data):\n",
    "    # update audio buffer\n",
    "    buffer[:] =data.flatten()\n",
    "\n",
    "    # generate predictions\n",
    "    reset(vimp)\n",
    "    run(vimp)\n",
    "\n",
    "    # update mel and activation buffers\n",
    "    melBuffer[:] = np.roll(melBuffer, -patchSize)\n",
    "    melBuffer[:, -patchSize:] = pool['melbands'][-patchSize:,:].T\n",
    "    img_mel.set_data(melBuffer)\n",
    "    \n",
    "    actBuffer[:] = np.roll(actBuffer, -1)\n",
    "    \n",
    "    actBuffer[:, -1] = [pool['danceability'][-1, 0],\n",
    "                        pool['voice_instrumental'][-1, 1],\n",
    "                        pool['aggressive'][-1, 0],\n",
    "                        pool['happy'][-1, 0]]\n",
    "    img_act.set_data(actBuffer)\n",
    "\n",
    "    # update plots\n",
    "    f.canvas.draw()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Start processing the loopback stream"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# initialize plot buffers\n",
    "melBuffer = np.zeros([numberBands, patchSize * displaySize])\n",
    "actBuffer = np.zeros([nLabels, displaySize])\n",
    "\n",
    "# reset storage\n",
    "pool.clear()\n",
    "\n",
    "# initialize plots\n",
    "f, ax = plt.subplots(1, 2, figsize=[9.6, 7])\n",
    "f.canvas.draw()\n",
    "\n",
    "ax[0].set_title('Mel Bands')\n",
    "img_mel = ax[0].imshow(melBuffer, aspect='auto',\n",
    "                       origin='lower', vmin=0, vmax=6)\n",
    "ax[0].set_xticks([])\n",
    "\n",
    "ax[1].set_title('Activations')\n",
    "img_act = ax[1].matshow(actBuffer, aspect='auto', vmin=0, vmax=1)\n",
    "ax[1].set_xticks([])\n",
    "ax[1].yaxis.set_ticks_position('right')\n",
    "plt.yticks(np.arange(nLabels), labels, fontsize=8)\n",
    "f.colorbar(img_act, ax=ax[1], orientation='horizontal')\n",
    "\n",
    "# capture and process the speakers loopback\n",
    "with sc.all_microphones(include_loopback=True)[0].recorder(samplerate=sampleRate) as mic:\n",
    "    while True:\n",
    "        callback(mic.record(numframes=bufferSize).mean(axis=1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
